{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在Python中，通常使用quantile函数做等频（等比例）分箱，使用cut函数做等宽分箱"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import pandas as pd\n",
    "\n",
    "#按均匀分布生成100个介于10到100之间的实数\n",
    "tmp = pd.Series([random.uniform(10, 100) for x in range(100)])\n",
    "\n",
    "#1.使用pd.Series下的quantile函数进行等比分箱，此处将数据分成4份\n",
    "x = tmp.quantile(q=[0.25,0.5,0.75, 1])\n",
    "x.index = ['A','B','C','D']\n",
    "tmp_quantile = tmp.apply(lambda m: x[x>=m].index[0]).values\n",
    "\n",
    "#...另外常可通过均值、中位数、最大最小值来平滑数值以生成新的特征，这里拿均值来举例\n",
    "y = tmp.groupby(tmp_quantile).mean()\n",
    "tmp_qmean = [y[x] for x in tmp_quantile]\n",
    "\n",
    "#2.使用cut函数进行等宽分箱，此处将数据分成5份\n",
    "tmp_cut = pd.cut(tmp,bins=5,labels=[\"B1\",\"B2\",\"B3\",\"B4\",\"B5\"])\n",
    "\n",
    "#...另外可通过设置labels为NULL，并通过levles函数查看cut的水平\n",
    "#...进一步确定各分箱的取值区间\n",
    "#...可通过均值、中位数、最大最小值来平滑数值以生成新的特征，这里拿均值来举例\n",
    "z = tmp.groupby(tmp_cut).mean()\n",
    "tmp_cmean = [z[x] for x in tmp_cut]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用Python基于iris数据集，以Species为目标变量，说明特征Sepal.Length的离散化过程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.5"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the iris dataset from a remote CSV file (needs network access).\n",
    "iris = pd.read_csv(\"http://image.cador.cn/data/iris.csv\")\n",
    "\n",
    "def gains(u, v):\n",
    "    ent_u = [np.sum([p*np.log2(1/p) for p in ct/np.sum(ct)]) for ct in [np.unique(u,return_counts=True)[1]]][0]\n",
    "    v_id,v_ct = np.unique(v,return_counts=True)\n",
    "    ent_u_m = [np.sum([p*np.log2(1/p) for p in ct/np.sum(ct)]) for ct in [np.unique(u[v==m],return_counts=True)[1] for m in v_id]]\n",
    "    return ent_u - np.sum(np.array(ent_u_m)*(v_ct/np.sum(v_ct)))\n",
    "\n",
    "def get_split_value(u,x):\n",
    "    \"\"\"Return the threshold on `x` that maximises the information gain on `u`.\n",
    "\n",
    "    Every distinct value e of `x` is tried as a binary split (x <= e versus\n",
    "    x > e) and scored with `gains`; the smallest threshold reaching the\n",
    "    strictly largest gain wins. If no split has a positive gain, the minimum\n",
    "    of `x` is returned.\n",
    "\n",
    "    Parameters:\n",
    "        u: class labels aligned with `x` (must support boolean-mask indexing).\n",
    "        x: one-dimensional numeric values of the feature to discretise.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `x` is empty.\n",
    "    \"\"\"\n",
    "    x = np.asarray(x)\n",
    "    if x.size == 0:\n",
    "        raise ValueError(\"x must contain at least one value\")\n",
    "    # np.unique yields the sorted distinct values: a repeated value produces the\n",
    "    # same partition and the same gain, so scoring it again is wasted work.\n",
    "    candidates = np.unique(x)\n",
    "    max_gains, e_split = 0, candidates[0]\n",
    "    for e in candidates:\n",
    "        # Binary indicator of the right-hand side of the split (x > e).\n",
    "        tmp = np.zeros(len(x))\n",
    "        tmp[x>e]=1\n",
    "        tmp_gain = gains(u,tmp)\n",
    "        if tmp_gain > max_gains:\n",
    "            max_gains,e_split  = tmp_gain, e\n",
    "    return e_split\n",
    "\n",
    "# Search the Sepal.Length split point using Species as the target labels;\n",
    "# the value is shown as the cell output.\n",
    "get_split_value(iris.Species,iris['Sepal.Length'].values)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
